In this analysis we set out to quantify a collective narrowing of behavior in the Twitter social network. To do this, we examine and show evidence for two conjectures:
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt
import json
from nltk.tokenize import word_tokenize
import re
from collections import Counter
from nltk.corpus import stopwords
import string
import pandas as pd
import pytz
import vincent
import random
from wordcloud import WordCloud
from twython import Twython
import seaborn as sns
import os
import time
from alchemyapi import AlchemyAPI
import preprocessor as p
alchemyapi = AlchemyAPI()
vincent.core.initialize_notebook()
%matplotlib inline
These helper methods help us efficiently analyze the data later on. Note that not all of them are currently in use.
Methods and regular expressions for doing tweet preprocessing
# Regexes used to pull tweet tokens out of raw text.
# NOTE: the order of the alternatives matters -- earlier patterns win, so
# URLs/mentions/hashtags are matched before the generic word patterns.
regex_str = [
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
# Compile the alternatives into one capturing group. IGNORECASE lets the
# lower-case word patterns above match any casing.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# Despite the name, this matches emoji code-point ranges (not ASCII
# emoticons like :-) ); it is used to protect emoji from lower-casing.
emoticon_re = re.compile("["
    u"\U0001F600-\U0001F64F" # emoticons
    u"\U0001F300-\U0001F5FF" # symbols & pictographs
    u"\U0001F680-\U0001F6FF" # transport & map symbols
    u"\U0001F1E0-\U0001F1FF" # flags (iOS)
    "]+", flags=re.UNICODE)
def tokenize(s):
    """
    Tokenizes a string using the module-level tokens_re pattern.
    -Parameter s: The string to tokenize
    -Return: The list of matched token strings, in order of appearance
     (re.findall over the single capturing group)
    """
    return tokens_re.findall(s)
def preprocess(s, lowercase=False):
    """
    Tokenize a string, optionally lower-casing every token that is not an
    emoji (emoji tokens are left untouched so they remain matchable).
    Parameter s: The string to preprocess
    Parameter lowercase: When True, lower-case all non-emoji tokens
    Return: The list of processed tokens
    """
    result = tokenize(s)
    if not lowercase:
        return result
    lowered = []
    for tok in result:
        if emoticon_re.search(tok):
            lowered.append(tok)
        else:
            lowered.append(tok.lower())
    return lowered
Methods to aid in the mapping of tweets using leaflet.js
def to_json(data_array):
    """
    Convert a list of tweets into a GeoJSON FeatureCollection and write it
    to geo_data.json in the current directory for use by leaflet.js.
    WARNING: silently overwrites any existing geo_data.json file.
    -Parameter data_array: The list of tweet dicts to convert; each dict
     needs 'coordinates', 'text' and 'created_at' keys
    """
    # Build one GeoJSON Feature per tweet.
    features = [
        {
            "type": "Feature",
            "geometry": {"type": "Point", "coordinates": tweet['coordinates']},
            "properties": {
                "text": tweet['text'],
                "created_at": tweet['created_at'],
            },
        }
        for tweet in data_array
    ]
    geo_data = {"type": "FeatureCollection", "features": features}
    # Write the collection out to a file.
    with open('geo_data.json', 'w') as fout:
        fout.write(json.dumps(geo_data, indent=4))
Methods for doing useful computations using vectors in python list form. These methods are adapted from the excellent "Data Science from Scratch" by Joel Grus.
#Note vectors here are denoted by normal python lists
def vector_add(v,w):
    """
    Add two vectors elementwise (truncates to the shorter one, as zip does).
    -Parameter v: The first vector to add
    -Parameter w: The second vector to add
    -Return: A new vector holding the elementwise sums
    ex. [0,2] + [1,1] = [1,3]
    """
    out = []
    for a, b in zip(v, w):
        out.append(a + b)
    return out
def vector_sum(vectors):
    """
    Sum a list of vectors componentwise.
    -Parameter vectors: A non-empty list of vectors to add together
    -Return: The componentwise sum of all the vectors
    ex. [0,2] + [1,1] + [2,2] = [3,5]
    """
    # Fold left over the list, accumulating elementwise sums.
    total = vectors[0]
    for vec in vectors[1:]:
        total = [a + b for a, b in zip(total, vec)]
    return total
def vector_subtract(v, w):
    """
    Subtract vector w from vector v elementwise (v - w).
    -Parameter v: The vector to be subtracted from
    -Parameter w: The vector to subtract
    -Return: A new vector holding the elementwise differences
    ex. [1,3] - [1,1] = [0,2]
    """
    return [a - b for a, b in zip(v, w)]
def scalar_multiply(c, v):
    """
    Multiply every component of vector v by the scalar c.
    -Parameter c: The constant to multiply by
    -Parameter v: The vector to be scaled
    -Return: A new vector scaled by c
    ex. 3 * [1,2] = [3,6]
    """
    out = []
    for component in v:
        out.append(c * component)
    return out
def vector_mean(vectors):
    """
    Compute the componentwise mean of a list of vectors.
    -Parameter vectors: The non-empty list of vectors to average
    -Return: A vector whose elements are the componentwise means
    ex. vector_mean([[0,1], [1,1]]) = [0.5,1]
    """
    # Scale the componentwise sum by 1/n (same operation order as
    # scalar_multiply(1/n, vector_sum(vectors))).
    inv_n = 1 / len(vectors)
    total = vectors[0]
    for vec in vectors[1:]:
        total = [a + b for a, b in zip(total, vec)]
    return [inv_n * component for component in total]
def dot(v, w):
    """
    Compute the dot product v . w. Order does not matter (commutative).
    -Parameter v: The first vector
    -Parameter w: The second vector
    -Return: v dot w
    ex. [1,1] dot [2,2] = 4
    """
    total = 0
    for a, b in zip(v, w):
        total += a * b
    return total
def sum_of_squares(v):
    """
    Return the sum of squares of a vector's components (i.e. v dot v).
    -Parameter v: The vector to be squared and summed
    -Return: The sum of squares; its square root is the vector's length
    ex. sum_of_squares([2,2]) = 8
    """
    return sum(component * component for component in v)
def squared_distance(v,w):
    """
    Compute the squared Euclidean distance between two vectors, i.e. the
    sum of squares of (v - w).
    -Parameter v: The first vector
    -Parameter w: The second vector
    -Return: The squared distance; its square root is the actual distance
    ex. squared_distance([2,2],[1,1]) = 2
    """
    return sum((a - b) * (a - b) for a, b in zip(v, w))
class KMeans(object):
    """Performs naive k-means clustering over vectors (plain python lists)."""

    def __init__(self, k):
        """
        Init the KMeans object
        -Parameter k: the number of clusters to find
        """
        self.k = k # number of clusters
        self.means = None # means of clusters (populated by train())

    def classify(self, input_vector):
        """
        Return the index of the cluster mean closest to the input.
        -Parameter input_vector: the vector to be classified
        -Return: the index (0..k-1) of the closest cluster mean
        """
        return min(range(self.k),
                   key=lambda i: squared_distance(input_vector, self.means[i]))

    def train(self, inputs):
        """
        Train on a list of vectors: repeatedly assign each input to its
        nearest mean, then recompute each mean, until assignments stop
        changing (Lloyd's iteration). Initial means are a random sample of
        the inputs, so results depend on the random module's state.
        -Parameter inputs: The list of vectors to be trained on
        """
        # randomly pick k of the inputs to serve as the starting means
        self.means = random.sample(inputs, self.k)
        assignments = None
        while True:
            # Find new assignments
            new_assignments = list(map(self.classify, inputs))
            # If no assignments have changed, then we have convergence and are done
            if assignments == new_assignments:
                return
            # Otherwise keep the new assignments,
            assignments = new_assignments
            # and recompute each cluster mean from its assigned points
            for i in range(self.k):
                i_points = [p for p, a in zip(inputs, assignments) if a == i]
                # avoid divide-by-zero if i_points is empty (keeps old mean)
                if i_points:
                    self.means[i] = vector_mean(i_points)
Methods to help with analysis and filtration of the data. Note: the bounding boxes are specified in the following form: [lower_left_longitude, lower_left_latitude, upper_right_longitude, upper_right_latitude] (longitude first, matching the order used by checkBox);
coordinates use the WGS84 system.
def plot_bar(data1, data2, title):
    """
    Plot a comparative bar graph whose x-axis is dictated by data1's top 10
    hashtags; data2's counts for those same hashtags are plotted alongside.
    -Parameter data1: Counter of hashtag counts whose top 10 set the x-axis
    -Parameter data2: Counter of hashtag counts matched against data1's top 10
    -Parameter title: The title string for the plot
    """
    # Top 10 most common hashtags. Counter.most_common already returns them
    # sorted by count, descending, so no extra sort is needed.
    datums = data1.most_common(10)
    # Look up the matching count in the second data set for each hashtag.
    match = [(key, data2[key]) for key, score in datums]
    # Ordered hashtags for axis labeling, plus the two aligned score arrays.
    hashtag = [key for key, score in datums]
    score = [s for key, s in datums]
    score2 = [s for key, s in match]
    x_pos = np.arange(len(hashtag))
    # Put local and national data into a pandas data frame and plot it.
    # BUG FIX: the original called plt.figure(figsize=(20, 10)) first, but
    # df.plot.bar() opens its own new figure, leaving the sized one blank and
    # unused -- pass figsize to the plot call instead.
    df = pd.DataFrame({'local': score, 'national': score2},
                      columns=['local', 'national'])
    df.plot.bar(figsize=(20, 10))
    plt.xticks(x_pos, hashtag)
    plt.title(title)
    plt.ylabel('Number of Tweets')
def word_cloud(counter):
    """
    Create and display a word cloud from the top 100 terms of the passed
    counter using the WordCloud api.
    -Parameter counter: The Counter object to be rendered as a word cloud
    """
    # Feed the counts to WordCloud directly instead of rebuilding a giant
    # string with every term repeated `score` times: same picture, but
    # O(#terms) work instead of O(total count of all terms).
    frequencies = dict(counter.most_common(100))
    wordcloud = WordCloud().generate_from_frequencies(frequencies)
    plt.figure(figsize=(20, 10))
    plt.imshow(wordcloud)
    plt.axis("off")
def filter_geo(d_array, loc):
    """
    Filter a list of tweets down to those whose coordinates fall inside the
    given bounding box (as judged by checkBox).
    -Parameter d_array: The list of tweets to be filtered
    -Parameter loc: The bounding box to filter against
    -Return: A new list containing only the tweets inside the box
    """
    return [tweet for tweet in d_array if checkBox(tweet, loc)]
def count_terms(d_array):
    """
    Count hashtag occurrences across an array of tweets.
    -Parameter d_array: The array of tweet dicts (each with a 'text' key)
    -Return: A collections.Counter mapping hashtag string -> count
    """
    # NOTE: the old punctuation/stopword lists were built here but never
    # used (hashtag tokens cannot be stopwords), so they have been removed.
    count = Counter()
    for d in d_array:
        terms_hash = [term for term in preprocess(d['text']) if term.startswith('#')]
        count.update(terms_hash)
    # A bare '#' tokenizes as its own term; drop it since it is punctuation,
    # not a hashtag. BUG FIX: `del count['#']` raised KeyError whenever no
    # bare '#' appeared -- pop with a default is safe either way.
    count.pop('#', None)
    return count
def rand_sample(n, dat_arr):
    """
    Take an order-maintaining random sample of size n from the passed array.
    -Parameter n: The size of the sample to be returned
    -Parameter dat_arr: The array to be sampled from
    -Return: A list of n elements of dat_arr, in their original relative order
    """
    # Sample n distinct indices, then read them back in ascending order so
    # the original ordering of dat_arr is preserved.
    chosen_indices = random.sample(range(len(dat_arr)), n)
    chosen_indices.sort()
    return [dat_arr[i] for i in chosen_indices]
#Bounding Boxes of the most populous cities in the US and a few others to get a better spread.
#Boxes are [west_long, south_lat, east_long, north_lat] -- longitude FIRST
#(e.g. -122.75 can only be a longitude); checkBox compares coordinates[0]
#against box[0] accordingly.
nyc = [-74,40,-73,41]
san_fran = [-122.75,36.8,-121.75,37.8]
la = [-118.668176,33.703692,-118.155289,34.337306]
chicago = [-87.940267,41.644335,-87.524044,42.023131]
houston = [-95.788087,29.523624,-95.014496,30.110732]
philadelphia = [-75.280303,39.871514,-74.955763,40.137992]
phoenix = [-112.324056,33.29026,-111.926046,33.92057]
san_antonio = [-98.805851,29.224141,-98.222958,29.73872]
san_diego = [-117.282167,32.534856,-116.90816,33.114249]
dallas = [-98.2538,31.9899,-95.4461,33.6828]
san_jose = [-122.045668,37.124493,-121.589154,37.469538]
seattle = [-122.435908,47.495551,-122.235903,47.734145]
denver = [-105.109927,39.614431,-104.600296,39.914247]
nashville = [-87.054903,35.967785,-86.515588,36.405496]
jacksonville = [-82.049502,30.103748,-81.391412,30.586232]
tampa = [-82.906952,27.616014,-82.259741,28.171478]
dc = [-77.6001,38.5294,-76.5106,39.1381]
atlanta = [-84.9008,33.3635,-83.8162,34.2041]
def checkBox(d, box):
    """
    Check whether a tweet's coordinates fall inside a bounding box.
    Lower/left edges are inclusive, upper/right edges are exclusive.
    -Parameter d: The tweet to be checked (dict whose 'coordinates' holds an
     [x, y] pair matching the box's axis order)
    -Parameter box: The bounding box [x_min, y_min, x_max, y_max]
    -Return: True if the tweet is inside the box, False otherwise
    """
    x = d['coordinates'][0]
    y = d['coordinates'][1]
    inside_horizontal = box[0] <= x < box[2]
    inside_vertical = box[1] <= y < box[3]
    return inside_horizontal and inside_vertical
This is where we start analyzing the data. The data set being analyzed is a set of about 4 million geo-tagged tweets captured between 19:30 on 4/27/16 to 10:30 on 4/28/16 using Twitter's streaming api. Code to capture the tweets can be found in /twitterapi.py.
We start by unpickling the tweet file and dumping all of the tweets into the data_array. Here we blindly filter out all tweets that have no hashtags or that contain blacklisted hashtags. Further work would swap out this wide-spread ban with entity analysis to pick out topics without relying on hashtags. While filtering out all non-hashtagged tweets does mean we lose a lot of our data, previous work has shown that hashtags correlate very well with retweet rate, which means we still get all of the important tweets (Suh, Bongwon, et al.).
#we filter these hashtags because they seem to dominate everywhere and are not
#very useful tweets to look at because they are just advertisements
bad_hashtags = ['#Hiring','#job','#Job','#Jobs','#CareerArc']
data_array = []
count = 0
# `with` guarantees the pickle file is closed even if loading blows up
with open('/Users/calvin/Documents/Lehigh/English/Research/data/cap4.pkl', 'rb') as f:
    while True:
        try:
            dd = pkl.load(f)
        except EOFError:
            # clean end of the pickle stream -- all tweets read
            break
        except Exception:
            # Corrupt/unreadable record: report how many we have skipped so
            # far and try the next one. BUG FIX: the original called
            # `unpickler.load()` here, but no `unpickler` was ever defined,
            # so the first bad record raised NameError and killed the loop.
            # NOTE(review): if the stream position does not advance past the
            # bad record, this could spin on the same error -- confirm
            # against a known-corrupt capture file.
            print(count)
            count += 1
            continue
        # If the tweet only carries a place bounding box, use the mean of the
        # box corners as its point coordinates.
        if dd['coordinates'] is None:
            if dd['place'] is None:
                continue
            dd['coordinates'] = vector_mean(dd['place']['bounding_box']['coordinates'][0])
        else:
            # account for edge case where coordinates are wrapped in GeoJSON
            dd['coordinates'] = dd['coordinates']['coordinates']
        # Keep only hashtagged tweets that contain none of the ad hashtags.
        # TODO: sentiment/entity analysis would let us keep more of the data
        # instead of filtering on hashtag presence.
        if '#' in dd['text'] and not any(hashtag in dd['text'] for hashtag in bad_hashtags):
            data_array.append(dd)
Here we apply location-based filtering to the preprocessed dataset. My city choices were motivated by first choosing cities with the highest population and tweet densities, and then altering my choices as needed to get better coverage of the entire nation. The cities I have chosen to look at are:
# Apply the city bounding-box filters to the preprocessed data set,
# producing one tweet list per city
nyc_geo_data = filter_geo(data_array, nyc)
sf_geo_data = filter_geo(data_array, san_fran)
la_geo_data = filter_geo(data_array, la)
ch_geo_data = filter_geo(data_array, chicago)
sea_geo_data = filter_geo(data_array, seattle)
tamp_geo_data = filter_geo(data_array, tampa)
dallas_geo_data = filter_geo(data_array, dallas)
dc_geo_data = filter_geo(data_array, dc)
atlanta_geo_data = filter_geo(data_array, atlanta)
phoenix_geo_data = filter_geo(data_array, phoenix)
denver_geo_data = filter_geo(data_array, denver)
Here we take a random sample of the data array to serve as our national control. Note that because we filter out so many tweets and our dataset is not very large, I have concluded that for this analysis it is okay if our randomly sampled array is the same size as the original data array. If the data set were larger, then it would be prudent for space and statistical reasons to have a smaller national dataset.
# Size of the national control sample (deliberately the same size as the
# full filtered data set)
n = len(data_array)
#create a random sample of size n from the data_array
rand_smpl = rand_sample(n,data_array)
Here we graph the top 10 most common hashtags for each of the chosen cities. We find that local trending hashtags have little to no correlation with national trending hashtags. In some cases, such as NYC, we find that the two numbers have almost negative correlation. It also becomes clear that the top local trending hashtags often account for a large proportion of that hashtag's total usage. A nicer visualization of this data can be found in /Multimodal_Project/index.html. We also create word clouds with the top 100 hashtags to see more depth.
# Bar chart of the national top 10 hashtags from the random control sample
count_rand = count_terms(rand_smpl)
datums = count_rand.most_common(10)
#Sort the datums with most common first (most_common already returns them
#sorted by count descending, so this is a no-op safety net)
datums.sort(key=lambda x: x[1], reverse=True)
#create the ordered list of hashtags for axis labeling
hashtag = list(zip(*datums))[0]
#zip up the hashtag scores into an array aligned with `hashtag`
score = list(zip(*datums))[1]
#arrange the hashtags in order
x_pos = np.arange(len(hashtag))
#plot the national counts as a labeled bar chart
plt.figure(figsize=(20, 10))
plt.bar(x_pos,score,align='center')
plt.xticks(x_pos, hashtag)
plt.title("National Top 10 Hashtags")
plt.ylabel('Number of Tweets')
word_cloud(count_rand)
# Per-city comparative bar charts (city top 10 hashtags vs. the national
# counts for the same hashtags) plus a top-100 word cloud for each city
count_nyc = count_terms(nyc_geo_data)
plot_bar(count_nyc,count_rand,"NYC Top 10 Hashtags")
word_cloud(count_nyc)
count_sf = count_terms(sf_geo_data)
plot_bar(count_sf,count_rand,"San Francisco Top 10 Hashtags")
word_cloud(count_sf)
count_la = count_terms(la_geo_data)
plot_bar(count_la,count_rand,"Los Angeles Top 10 Hashtags")
word_cloud(count_la)
count_ch = count_terms(ch_geo_data)
plot_bar(count_ch,count_rand,"Chicago Top 10 Hashtags")
word_cloud(count_ch)
count_sea = count_terms(sea_geo_data)
plot_bar(count_sea,count_rand,"Seattle Top 10 Hashtags")
word_cloud(count_sea)
count_tamp = count_terms(tamp_geo_data)
plot_bar(count_tamp,count_rand,"Tampa Top 10 Hashtags")
word_cloud(count_tamp)
count_dallas = count_terms(dallas_geo_data)
plot_bar(count_dallas,count_rand,"Dallas Top 10 Hashtags")
word_cloud(count_dallas)
count_phoenix = count_terms(phoenix_geo_data)
plot_bar(count_phoenix,count_rand,"Phoenix Top 10 Hashtags")
word_cloud(count_phoenix)
count_denver = count_terms(denver_geo_data)
plot_bar(count_denver,count_rand,"Denver Top 10 Hashtags")
word_cloud(count_denver)
count_atlanta = count_terms(atlanta_geo_data)
plot_bar(count_atlanta,count_rand,"Atlanta Top 10 Hashtags")
word_cloud(count_atlanta)
count_dc = count_terms(dc_geo_data)
plot_bar(count_dc,count_rand,"Washington DC Top 10 Hashtags")
word_cloud(count_dc)
Here we explore the effect of tweet location on the number of tweet interactions. We use the Twitter REST API to get the favorite and retweet counts of each of the tweets with either the hashtag Trump or Trump2016. Then we define an interaction metric as simply the sum of the number of likes and retweets. Then we look at the average number of interactions per examined area and compare those results with sentiment analysis from the Alchemy API and primary results from the New York Times. This work has not progressed as far as I wanted it to because I am only allowed to call the REST API 60 times every 15 minutes, while I have thousands of tweets to look through.
def load_keys():
    """
    Load the Twitter REST API keys from the `.keys` file in the working
    directory, one key per line.
    -Return: The list of key strings with newlines stripped
    """
    path = os.path.join('.keys')
    # BUG FIX: the original opened the file without closing it; `with`
    # releases the handle even if reading fails.
    with open(path, 'r') as f:
        return [line.replace('\n', '') for line in f]
#use twython to create a rest api client; keys[0..3] come from the .keys
#file, in the order load_keys() read them
keys = load_keys()
twitter = Twython(keys[0], keys[1],
                  keys[2], keys[3])
#filter the national random sample so that it only contains tweets with
##Trump2016 or #Trump (note "#Trump" as a substring also matches "#Trump2016")
filtered = []
for d in rand_smpl:
    if "#Trump2016" in d['text'] or "#Trump" in d['text']:
        filtered.append(d)
#if we have greater than 60 tweets, take a random sample of 60 and use that
#instead; rand_sample raises when fewer than 60 are available, in which case
#we deliberately keep the full list
try:
    filtered = rand_sample(60, filtered)
except Exception:
    pass
ave = 0      # running sum of (retweets + favorites); divided by t_count below
t_count = 0  # number of tweets that passed the interaction filter
c = 0        # index into `filtered`
#iterate though the tweets and add up the retweet and favorite counts
while c < len(filtered):
    try:
        rt = twitter.get_retweets(id=filtered[c]['id'])
        # only count tweets with fewer than 10 retweets AND fewer than 10
        # favorites -- presumably to exclude viral outliers (confirm intent)
        if len(rt) > 0 and rt[0]['retweet_count'] < 10 and rt[0]['favorite_count'] < 10:
            t_count += 1
            ave += rt[0]['retweet_count'] + rt[0]['favorite_count']
        else:
            pass
        c += 1
    except Exception:
        #If we are out of calls then wait until we have calls again
        if twitter.get_application_rate_limit_status()['resources']['statuses']['/statuses/retweets/:id']['remaining'] == 0:
            print("RATE EXCEEDED")
            re_time = twitter.get_application_rate_limit_status()['resources']['statuses']['/statuses/retweets/:id']['reset']
            curr_time = time.time()
            # sleep until the rate window resets (+5s of slack); c is NOT
            # advanced, so the same tweet is retried after the wait
            time.sleep(re_time - curr_time + 5)
        else:
            # any other API error (e.g. deleted tweet): skip this tweet
            c += 1
            pass
#log the average number of interactions per counted tweet
print(ave/t_count)
#helper call to see how many REST API calls I have left in the current window
twitter.get_application_rate_limit_status()['resources']['statuses']['/statuses/retweets/:id']
Now we use the Alchemy API to get the average tweet sentiment score for each geo location. I would like to spend more time preprocessing the tweet text to get better sentiment results. That being said, that is a hard and arbitrary problem, so for now I send the tweets in with minimal filtering.
#filter the geo data for tweets containing the target hashtag
#(this cell was re-run with each city's *_geo_data in turn -- see the note
#below about API limits)
filtered = []
for d in tamp_geo_data:
    if "#Trump2016" in d['text'] or "#Trump" in d['text']:
        filtered.append(d)
print(len(filtered))
#Obtain the average tweet sentiment score for the filtered array
text = []       # scratch list from an earlier experiment; currently unused
sentiment = []  # one numeric score per successfully analyzed tweet
s = ""          # scratch string from an earlier experiment; currently unused
for d in filtered:
    # clean the tweet text with the tweet-preprocessor package before scoring
    cleaned = p.clean(d['text'].lower())
    response = alchemyapi.sentiment('text',cleaned)
    # stop as soon as the API errors (e.g. out of calls)
    if response['status'] != 'OK':
        break
    # neutral responses carry no numeric score, so record 0
    if response['docSentiment']['type'] == 'neutral':
        sentiment.append(0)
    else:
        sentiment.append(float(response['docSentiment']['score']))
ave = np.mean(np.array(sentiment))
print(ave)
Because of the limits of the REST and Alchemy APIs, I had to run the code separately for each of the geo locations. Below I take the data gathered and put it into a pandas dataframe for later use.
#Hand-transcribed results from the per-city runs above, as
#[average retweet rates, sentiment score]
Chicago = [2.7142857142857144, 0.0672747]
Tampa = [2.7142857142857144, -0.107008]
NYC = [2.0714285714285716, 0.0666163]
SF = [2.25, -0.351972]
Phoenix = [2.0625,0.000326956]
Denver = [3.0,-0.562185]
Atlanta = [1.2,0.0837211]
DC = [1.8333333333333333,-0.0416102]
Dallas = [3.0714285714285716,0.0199777]
LA = [1.9230769230769231,0.0681025]
sea = [2.4,-0.164262] #no positive tweets
national = [2.3076923076923075,0.128462]
#The same numbers again as parallel lists ordered to match `locations`
#(duplicated by hand -- keep both copies in sync if values change)
locations = ["Chicago", "Tampa", "NYC", "SF", "Phoenix",
             "Denver", "Atlanta", "DC", "Dallas", "LA",
             "Seattle", "National"]
retweet_fav = [2.7142857142857144,2.7142857142857144,2.0714285714285716,
               2.25,2.0625,3.0,1.2,1.8333333333333333,3.0714285714285716,
               1.9230769230769231,2.4,2.3076923076923075]
sentiment_score = [0.0672747,-0.107008,0.0666163,-0.351972,0.000326956,
                   -0.562185,0.0837211,-0.0416102,0.0199777,0.0681025,
                   -0.164262,0.128462]
#assemble everything into a single data frame for plotting
data = {'locations': locations,
        'interactions' : retweet_fav,
        'sentiment':sentiment_score }
df = pd.DataFrame(data, columns=['locations', 'interactions', 'sentiment'])
print(df)
Here it becomes clear that the number of retweets you get on your tweet is highly variable based on where you tweeted from.
#bar chart of average interactions (retweets + favorites) per location
plt.figure(figsize=(20,10))
plt.bar(range(12), df.interactions, align='center')
plt.ylabel("Average Number of Retweets and Favorites")
plt.xticks(range(12), df.locations)
Here we see that the more negative the tweet is, the more interactions it gets. This hints that people respond most often to radical negative ideas.
#regression plot of sentiment vs. interactions, one labeled point per city
plt.figure(figsize=(20,10))
sns.regplot(x='interactions', y='sentiment', data=df)
for i, txt in enumerate(df.locations):
    plt.annotate(txt, (df.interactions[i],df.sentiment[i]))
#subset of cities for which NYT district primary results are available
locations = ["Chicago", "Tampa", "NYC", "Phoenix", "Atlanta", "DC", "Dallas", "National"]
#NOTE(review): the 4th value here (2.25) was SF's interaction average above,
#while Phoenix was 2.0625 -- possible hand-transcription error; confirm
retweet_fav = [2.7142857142857144,2.7142857142857144,2.0714285714285716,
               2.25,1.2,1.8333333333333333,3.0714285714285716, 2.3076923076923075]
#district primary result rates from the New York Times, aligned to `locations`
primary_rates = [0.47,0.464,0.418,0.41,0.266,0.13,0.248,0.418]
data = {'locations': locations,
        'interactions' : retweet_fav,
        'primary':primary_rates}
df = pd.DataFrame(data, columns=['locations', 'interactions','primary'])
Here we graph the total number of interactions against the district primary results from the New York Times. We find that the relationship is very linear with only a few outliers. If primary results are reckoned as a metric of Trump support, then this shows that unpopular opinions receive fewer interactions than popular opinions. This line also means it might be possible to guess primary results based on Twitter interactions, although I hesitate to make this claim because my data set is not very large.
#regression plot of NYT primary results vs. interactions, labeled per city
plt.figure(figsize=(20,10))
sns.regplot(x='interactions', y='primary', data=df)
for i, txt in enumerate(df.locations):
    plt.annotate(txt, (df.interactions[i],df.primary[i]))
Here we analyze a set of 4 million tweets scraped between 19:30 on 4/27/16 and 10:30 on 4/28/16. We set out to explore two conjectures: local trends are not correlated with national trends, and tweet interactions of similar tweets vary depending on where and how they were tweeted. We find that local and national trending hashtags are uncorrelated and sometimes negatively correlated, and that in many cases the top local trending hashtags account for a significant portion of the total hashtag use. We also find that the number of interactions one gets on a tweet is correlated with both its sentiment and its location. These results indicate that Twitter's trending algorithm creates in its users a collective narrowing of attention in both ideological and geographical space.
Mauskopf, Sara. "Tailored Trends Bring You Closer | Twitter Blogs." Tailored Trends Bring You Closer | Twitter Blogs. Twitter, 12 June 2012. Web. 02 May 2016.
Grus, Joel. Data Science from Scratch: First Principles with Python. O'Reilly, 2015. Print. 02 May 2016.
Suh, Bongwon, et al. "Want to be retweeted? large scale analytics on factors impacting retweet in twitter network." Social computing (socialcom), 2010 ieee second international conference on. IEEE, 2010. 02 May 2016.
Wang, Xiaolong, et al. "Topic sentiment analysis in twitter: a graph-based hashtag sentiment classification approach." Proceedings of the 20th ACM international conference on Information and knowledge management. ACM, 2011. 02 May 2016.
"Election 2016." New York Times. New York Times, 2 May 2016. Web. 2 May 2016.
Below we see the method I used to filter the tweets based on hashtag and then put them into a json format, which can be mapped by leaflet.js. In the future I want to continue trying to quantify tweet groupings using the K-Means algorithm and predetermined acceptable values of error. There is also code for examining time series data.
#filter the national sample down to #Trump2016 tweets and export them as
#GeoJSON (geo_data.json) for the leaflet.js map
filtered = []
for d in rand_smpl:
    if "#Trump2016" in d['text']:
        filtered.append(d)
print(len(filtered))
to_json(filtered)
#Dormant code for bucketing the filtered tweets into a per-minute time
#series (converted to US/Eastern) and charting the frequency with vincent:
# dates = []
# eastern = pytz.timezone('US/Eastern')
# for d in filtered:
#     dates.append(d['created_at'])
# ones = [1]*len(dates)
# idx = pd.DatetimeIndex(dates).tz_localize(pytz.utc).tz_convert(eastern)
# dates_series = pd.Series(ones, index=idx)
# # Resampling / bucketing
# per_minuite = dates_series.resample('1Min', how='sum').fillna(0)
# time_chart = vincent.Line(per_minuite)
# time_chart.axis_titles(x='Time', y='Freq')